In [1]:
# Import Packages
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import altair as alt
import us.states
In [2]:
# Load the December 2022 marketing-carrier on-time performance extract from the working directory
flight = pd.read_csv(
    "On_Time_Marketing_Carrier_On_Time_Performance_2022_12_final.csv"
)

# Bare last expression: renders the notebook's (truncated) preview of the frame
flight
Out[2]:
Year Month DayofMonth DayOfWeek FlightDate Marketing_Airline_Network DOT_ID_Marketing_Airline Flight_Number_Marketing_Airline Origin OriginCityName ... DistanceGroup CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay FirstDepTime TotalAddGTime LongestAddGTime DivAirportLandings
0 2022 12 19 1 12/19/2022 DL 19790 4628 BDL Hartford, CT ... 1 NaN NaN NaN NaN NaN NaN NaN NaN 0
1 2022 12 20 2 12/20/2022 DL 19790 4628 BDL Hartford, CT ... 1 NaN NaN NaN NaN NaN NaN NaN NaN 0
2 2022 12 21 3 12/21/2022 DL 19790 4628 BDL Hartford, CT ... 1 NaN NaN NaN NaN NaN NaN NaN NaN 0
3 2022 12 22 4 12/22/2022 DL 19790 4628 BDL Hartford, CT ... 1 NaN NaN NaN NaN NaN NaN NaN NaN 0
4 2022 12 23 5 12/23/2022 DL 19790 4628 BDL Hartford, CT ... 1 NaN NaN NaN NaN NaN NaN NaN NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
576822 2022 12 27 2 12/27/2022 NK 20416 913 MCI Kansas City, MO ... 6 22.0 0.0 1.0 0.0 14.0 NaN NaN NaN 0
576823 2022 12 28 3 12/28/2022 NK 20416 913 MCI Kansas City, MO ... 6 NaN NaN NaN NaN NaN NaN NaN NaN 0
576824 2022 12 29 4 12/29/2022 NK 20416 913 MCI Kansas City, MO ... 6 0.0 0.0 11.0 0.0 11.0 NaN NaN NaN 0
576825 2022 12 30 5 12/30/2022 NK 20416 913 MCI Kansas City, MO ... 6 NaN NaN NaN NaN NaN NaN NaN NaN 0
576826 2022 12 31 6 12/31/2022 NK 20416 913 MCI Kansas City, MO ... 6 NaN NaN NaN NaN NaN NaN NaN NaN 0

576827 rows × 50 columns

In [3]:
# `df` is a second name bound to the same DataFrame object as `flight` (no copy is made),
# so any in-place change made through `df` is also visible through `flight`.
df = flight

Flight volume by day of the week¶

Create a bar chart to show the number of flights on each day of the week. This can help identify which days have the most and least flights.

In [4]:
# Flights per DayOfWeek code, ordered by code (1..7) rather than by frequency.
day_counts = df["DayOfWeek"].value_counts().sort_index()

# Tidy two-column frame with explicit names. Passing `day_counts.index` / `.values`
# straight to px makes the `labels` keys depend on auto-generated column names
# (the "x" key is not reliably applied), so hover text showed raw names.
day_counts_df = day_counts.rename_axis("DayOfWeek").reset_index(name="NumFlights")

fig = px.bar(
    day_counts_df,
    x="DayOfWeek",
    y="NumFlights",
    labels={"DayOfWeek": "Day of Week", "NumFlights": "Number of Flights"},
    title="Flight Volume by Day of Week",
)

# Show the numeric codes 1..7 as weekday abbreviations and set the axis title.
fig.update_xaxes(
    tickvals=list(range(1, 8)),
    ticktext=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
    title="Days of the Week",
)

fig.show()

Flight volume by airline¶

Create a bar chart to show the number of flights per airline. This can help identify which airlines have the most flights in the dataset.

In [5]:
# Flights per marketing airline code, most frequent first (value_counts default order).
airline_counts = df["Marketing_Airline_Network"].value_counts()

# Tidy frame with explicit column names so the `labels` mapping below is keyed on
# real columns instead of px's auto-generated names (the old "x" key was not
# reliably applied when the Series index was passed as `x`).
airline_counts_df = airline_counts.rename_axis("Airline").reset_index(name="NumFlights")

fig = px.bar(
    airline_counts_df,
    x="Airline",
    y="NumFlights",
    labels={"Airline": "Airline", "NumFlights": "Number of Flights"},
    title="Flight Volume by Airline",
)

# Keep the plural axis title used by the original figure.
fig.update_xaxes(title="Airlines")

fig.show()

Average delay time by airline¶

Create a bar chart of the average departure delay (the mean of `DepDelayMinutes`, in minutes) for each airline.

In [6]:
# Mean of DepDelayMinutes per marketing airline (pandas skips NaN values in the mean).
average_delays = df.groupby("Marketing_Airline_Network")["DepDelayMinutes"].mean()

# Tidy frame with explicit column names so `labels` is keyed on real columns;
# the previous "x" key depended on px's auto-naming of the passed index.
average_delays_df = average_delays.rename_axis("Airline").reset_index(name="AvgDelay")

fig = px.bar(
    average_delays_df,
    x="Airline",
    y="AvgDelay",
    labels={"Airline": "Airline", "AvgDelay": "Average Delay (minutes)"},
    title="Average Delay by Airline",
)

# Keep the plural axis title used by the original figure.
fig.update_xaxes(title="Airlines")

fig.show()

Flight volume by origin city¶

Create a bar chart to show the number of flights per origin city. This can help identify which cities have the most flights in the dataset.

In [7]:
# Flights per origin city, most frequent first (value_counts default order).
city_counts = df["OriginCityName"].value_counts()

# Tidy frame with explicit column names so `labels` is keyed on real columns;
# the previous "x" key depended on px's auto-naming of the passed index.
city_counts_df = city_counts.rename_axis("OriginCity").reset_index(name="NumFlights")

fig = px.bar(
    city_counts_df,
    x="OriginCity",
    y="NumFlights",
    labels={"OriginCity": "Origin City", "NumFlights": "Number of Flights"},
    title="Flight Volume by Origin City",
)

# Explicit axis title, as in the original figure.
fig.update_xaxes(title="Origin City")

fig.show()

Flight volume by distance group¶

Create a bar chart to show the number of flights per distance group. This can help identify the distribution of flight distances in the dataset.

In [8]:
# Flights per DistanceGroup code, ordered by code rather than by frequency.
distance_counts = df["DistanceGroup"].value_counts().sort_index()

# Tidy frame with explicit column names so `labels` is keyed on real columns;
# the previous "x" key depended on px's auto-naming of the passed index.
distance_counts_df = distance_counts.rename_axis("DistanceGroup").reset_index(name="NumFlights")

fig = px.bar(
    distance_counts_df,
    x="DistanceGroup",
    y="NumFlights",
    labels={"DistanceGroup": "Distance Group", "NumFlights": "Number of Flights"},
    title="Flight Volume by Distance Group",
)

# Keep the plural axis title used by the original figure.
fig.update_xaxes(title="Distance Groups")

fig.show()

Heatmap of flight delays¶

Create a heatmap of the average departure delay for each combination of day of the week and airline. This can help identify patterns in delays for specific airlines on certain days.

In [9]:
# DayOfWeek code -> label; insertion order doubles as the display order of the x axis.
day_mapping = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}
weekday_order = list(day_mapping.values())

# One row per (day, airline) pair holding the mean of DepDelayMinutes.
heatmap_data = (
    df.groupby(["DayOfWeek", "Marketing_Airline_Network"])["DepDelayMinutes"]
    .mean()
    .reset_index()
)
heatmap_data["DayOfWeek"] = heatmap_data["DayOfWeek"].map(day_mapping)

fig = px.density_heatmap(
    heatmap_data,
    x="DayOfWeek",
    y="Marketing_Airline_Network",
    z="DepDelayMinutes",
    # The frame already holds means; aggregate with "avg" so a cell can never be
    # silently turned into a sum of several rows. `nbinsx` was dropped because
    # the x axis is categorical (one bin per weekday label).
    histfunc="avg",
    # Pin Mon..Sun explicitly instead of relying on row order in `heatmap_data`.
    category_orders={"DayOfWeek": weekday_order},
    color_continuous_scale=px.colors.diverging.Tealrose,
    labels={
        "DayOfWeek": "Day of Week",
        "Marketing_Airline_Network": "Airline",
        "DepDelayMinutes": "Average Delay (minutes)",
    },
)

# The colorbar title is set by hand; px would otherwise prefix it with the histfunc name.
fig.update_layout(
    title="Average Delay by Day of Week and Airline",
    coloraxis_colorbar=dict(
        title="Average Delay (minutes)"
    )
)

fig.show()

Time series of daily flight volume¶

Create a line chart to show the daily flight volume over the period covered by the dataset. This can help identify trends in flight volume over time.

In [10]:
# FlightDate is stored as text in MM/DD/YYYY form (e.g. "12/19/2022"). Parse it
# first: sorting the raw strings is lexical, not chronological, and plotly would
# draw a categorical axis instead of a real date axis.
flight_dates = pd.to_datetime(df["FlightDate"], format="%m/%d/%Y")

# Flights per calendar day, in chronological order (index is now datetime64).
daily_flight_volume = flight_dates.value_counts().sort_index()

# Tidy frame with explicit column names so `labels` is keyed on real columns.
daily_flight_volume_df = daily_flight_volume.rename_axis("FlightDate").reset_index(name="NumFlights")

fig = px.line(
    daily_flight_volume_df,
    x="FlightDate",
    y="NumFlights",
    labels={"FlightDate": "Date", "NumFlights": "Number of Flights"},
    title="Daily Flight Volume",
)

# Explicit axis title, as in the original figure.
fig.update_xaxes(title="Flight Date")

fig.show()

Box plot of delays by airline¶

Create a box plot to visualize the distribution of delays for each airline. This can help identify which airlines have the most variation in delay times.

In [11]:
# One box per airline code, summarising the DepDelayMinutes values of its flights
fig = px.box(
    df,
    x="Marketing_Airline_Network",
    y="DepDelayMinutes",
    labels={
        "Marketing_Airline_Network": "Airline",
        "DepDelayMinutes": "Delay (minutes)",
    },
)
fig.show()

Scatter plot of flight distances vs. delays¶

Create a scatter plot to show the relationship between flight distance and delay time. This can help identify whether longer flights tend to have more delays.

In [12]:
# One marker per flight: Distance on x, DepDelayMinutes on y.
# The colour repeats the y value so that large delays stand out.
# (plotly.express is imported once in the notebook's import cell; the
# redundant mid-notebook re-import was dropped.)
fig = px.scatter(
    df,
    x="Distance",
    y="DepDelayMinutes",
    color="DepDelayMinutes",
    color_continuous_scale=px.colors.sequential.Pinkyl,
    labels={"Distance": "Flight Distance (miles)", "DepDelayMinutes": "Delay (minutes)"},
    title="Delay vs. Flight Distance",
)

fig.show()

Number of Flights by State¶

In [13]:
# Flights per OriginState value as a two-column frame: "State", "NumFlights"
state_counts = (
    df["OriginState"]
    .value_counts()
    .rename_axis("State")
    .reset_index(name="NumFlights")
)
In [14]:
# Choropleth of flight counts per origin state.
# locationmode="USA-states" matches on two-letter state abbreviations
# (assumes the "State" column holds such codes; TODO confirm).
# (plotly.express is imported once in the notebook's import cell; the
# redundant mid-notebook re-import was dropped.)
fig = px.choropleth(
    state_counts,
    locations="State",
    color="NumFlights",
    locationmode="USA-states",
    scope="usa",
    color_continuous_scale=px.colors.sequential.Teal,
    labels={"NumFlights": "Number of Flights"},
    title="Number of Flights by State",
)

# Remove the side/bottom margins; leave 30px at the top for the title.
fig.update_layout(margin={"r": 0, "t": 30, "l": 0, "b": 0})
fig.show()

Map plot with the average delay time by state¶

In [15]:
# Mean DepDelayMinutes per OriginState value as a two-column frame: "State", "AvgDelay"
state_delays = (
    df.groupby("OriginState")["DepDelayMinutes"]
    .mean()
    .rename_axis("State")
    .reset_index(name="AvgDelay")
)
In [16]:
# Choropleth of the mean delay per origin state.
# locationmode="USA-states" matches on two-letter state abbreviations
# (assumes the "State" column holds such codes; TODO confirm).
# (plotly.express is imported once in the notebook's import cell; the
# redundant mid-notebook re-import was dropped.)
fig = px.choropleth(
    state_delays,
    locations="State",
    color="AvgDelay",
    locationmode="USA-states",
    scope="usa",
    color_continuous_scale=px.colors.diverging.Tealrose,
    labels={"AvgDelay": "Average Delay (minutes)"},
    title="Average Delay Time by State",
)

# Remove the side/bottom margins; leave 30px at the top for the title.
fig.update_layout(margin={"r": 0, "t": 30, "l": 0, "b": 0})
fig.show()

Top 100 flight routes¶

In [17]:
# NOTE(review): this cell is entirely commented out, so nothing here runs and
# `get_coordinates` is never defined. Presumably it was disabled because it
# geocodes city names through an external service (geopy/Nominatim); confirm
# before re-enabling, or remove the dead code.
# import pandas as pd
# import plotly.express as px
# from geopy.geocoders import Nominatim

# geolocator = Nominatim(user_agent="my_flight_analysis_app")

# def get_coordinates(city_name):
#     location = geolocator.geocode(city_name)
#     if location:
#         return location.latitude, location.longitude
#     else:
#         return None, None
In [18]:
# route_counts = df.groupby(["OriginCityName", "DestCityName"]).size().reset_index(name="NumFlights")
In [19]:
# N = 100  # Change this value to display a different number of top routes
# top_routes = route_counts.nlargest(N, "NumFlights")
In [20]:
# top_routes["OriginLatitude"], top_routes["OriginLongitude"] = zip(*top_routes["OriginCityName"].apply(get_coordinates))
# top_routes["DestLatitude"], top_routes["DestLongitude"] = zip(*top_routes["DestCityName"].apply(get_coordinates))
In [21]:
# fig = px.scatter_geo(top_routes,
#                      lat="OriginLatitude",
#                      lon="OriginLongitude",
#                      hover_name="OriginCityName",
#                      size_max=5,
#                      projection="natural earth")

# for _, row in top_routes.iterrows():
#     fig.add_trace(px.line_geo(lat=[row["OriginLatitude"], row["DestLatitude"]],
#                               lon=[row["OriginLongitude"], row["DestLongitude"]])
#                               .data[0])

# # Add city names to the bubbles
# for _, row in top_routes.iterrows():
#     fig.add_trace(
#         go.Scattergeo(
#             lat=[row["OriginLatitude"]],
#             lon=[row["OriginLongitude"]],
#             text=[row["OriginCityName"]],
#             mode="text",
#             textfont=dict(size=10, color="black"),
#             showlegend=False,
#             textposition="bottom center"
#         )
#     )

# fig.show()
In [22]:
# import plotly.graph_objs as go

# fig = go.Figure()

# # Customize map's appearance
# fig.update_geos(
#     resolution=50,
#     showcoastlines=True, coastlinecolor="Thistle",
#     showland=True, landcolor="LightGreen",
#     showocean=True, oceancolor="Azure",
#     showlakes=True, lakecolor="LightBlue",
#     showrivers=True, rivercolor="LightSteelBlue",
#     showcountries=True, countrycolor="DarkOrange",
#     showsubunits=True, subunitcolor="DarkOrange",
#     projection_type="natural earth"
# )

# # Add flight routes
# for _, row in top_routes.iterrows():
#     fig.add_trace(px.line_geo(lat=[row["OriginLatitude"], row["DestLatitude"]],
#                               lon=[row["OriginLongitude"], row["DestLongitude"]])
#                               .data[0])

# # Add city names to the bubbles
# for _, row in top_routes.iterrows():
#     fig.add_trace(
#         go.Scattergeo(
#             lat=[row["OriginLatitude"]],
#             lon=[row["OriginLongitude"]],
#             text=[row["OriginCityName"]],
#             mode="text",
#             textfont=dict(size=10, color="black"),
#             showlegend=False,
#             textposition="bottom center"
#         )
#     )

# # Add city markers with hover text
# fig.add_trace(
#     go.Scattergeo(
#         lat=top_routes["OriginLatitude"],
#         lon=top_routes["OriginLongitude"],
#         hovertext=top_routes["OriginCityName"],
#         mode="markers",
#         marker=dict(size=6, color="red", symbol="circle", line=dict(width=1, color="black")),
#         showlegend=False
#     )
# )

# # Customize layout
# fig.update_layout(
#     title="100 Top Flight Routes",
#     title_x=0.5,
#     geo=dict(
#         scope="world",
#         projection=dict(type="natural earth"),
#         showland=True,
#         landcolor="rgb(243, 243, 243)",
#         countrycolor="rgb(204, 204, 204)",
#     ),
#     margin=dict(t=50, b=0, l=0, r=0)
# )

# fig.show()
In [ ]: